/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	clean_citation.dta
			Restricted/wos_pubmed2pubmed.dta (restricted)

Outputs: 	clean_pubmed.dta

Note: This file cannot run without restricted data (see README)

*******************************************************************************/

* Start from an empty session and load the cleaned citation data
clear all

use "${data_clean}clean_citation.dta", clear

* Reduce the data to one (pubmedId, publicationYear) pair per paper. When a
* PubMed ID appears with more than one publication year, only the earliest
* year is retained.
keep pubmedId publicationYear
bysort pubmedId: egen earliest_year = min(publicationYear)
drop if publicationYear != earliest_year
duplicates drop
drop earliest_year

* Discard rows that have no PubMed ID
drop if pubmedId == .

* The citation crosswalk merged later in this file is keyed on pmid
rename pubmedId pmid

* Hold the paper list in a temporary file so it can be reloaded later
tempfile pdb_list
save `pdb_list'
	
* Attach the PubMed-to-PubMed citation records (Web of Science crosswalk) to
* each paper. Papers with no record in the crosswalk are discarded here.
merge 1:m pmid using "${data_raw}Restricted/wos_pubmed2pubmed.dta", keep(master match)
drop if _merge != 3
drop _merge

* Keep only the rows observed exactly 3, 5, or 10 years after publication.
* A horizon with no row in the data (e.g. because it is censored) ends up
* missing after the reshape.
generate pub_lag = year - pubyear
keep if pub_lag == 3 | pub_lag == 5 | pub_lag == 10
keep pmid pubyear stk_cites_nslf pub_lag

* One row per (pmid, pubyear), with the counts spread across
* stk_cites_nslf3, stk_cites_nslf5, and stk_cites_nslf10
reshape wide stk_cites_nslf, i(pmid pubyear) j(pub_lag)

tempfile wide_cites
save `wide_cites'

* Reload the full paper list and bring in the wide citation counts. Papers
* that are absent from the citation file have missing counts and a missing
* pubyear at this point.
use `pdb_list', clear
merge 1:1 pmid using `wide_cites', nogenerate

* Choose between zero and missing for papers that lack a citation count.
* For each horizon (3, 5, 10 years) the cutoff is max_year minus the horizon:
*   - published at or before the cutoff and no count -> treated as zero cites
*   - published after the cutoff                     -> count stays missing
* NOTE(review): max_year is computed from pubyear BEFORE the fill from
* publicationYear below, so it reflects only papers that matched the citation
* file at a 3-, 5-, or 10-year lag -- confirm this is the intended reference
* year for the cutoffs.
summarize pubyear
local max_year = r(max)

* Papers with no pubyear from the citation file fall back on the PDB
* publication year
replace pubyear = publicationYear if pubyear == .

foreach lag in 3 5 10 {
	local y`lag' = `max_year' - `lag'
	replace stk_cites_nslf`lag' = 0 if stk_cites_nslf`lag' == . & pubyear <= `y`lag''
}

* Stop with an error unless pmid uniquely identifies the rows, then restore
* the original identifier name
isid pmid
rename pmid pubmedId
	
* Restrict to papers published in 1980 or later (the WoS citation data does
* not go back further). Stata treats missing as larger than any number, so
* rows with a missing pubyear pass this filter and are handled next.
keep if pubyear >= 1980

* Keep a paper only if at least one of the two publication-year variables
* is known
keep if publicationYear != . | pubyear != .

save "${data_clean}clean_pubmed.dta", replace
